# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# End-to-end matrix multiplication tests.

load("//build_tools/bazel:iree_trace_runner_test.bzl", "iree_generated_trace_runner_test")

package(
    features = ["layering_check"],
    licenses = ["notice"],  # Apache 2.0
)

py_binary(
    name = "generate_e2e_matmul_tests",
    srcs = ["generate_e2e_matmul_tests.py"],
)

###########################################################################
##
## LLVMCPU backend
##
###########################################################################

# LLVMCPU, non-data-tiling, no microkernels
[iree_generated_trace_runner_test(
    name = "e2e_matmul_nondt_%s_%s_%s" % (lhs_rhs_type, acc_type, size),
    compiler_flags = [
        "--iree-opt-data-tiling=false",
        "--iree-llvmcpu-enable-ukernels=none",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=%s" % lhs_rhs_type,
        "--acc_type=%s" % acc_type,
        "--shapes=%s" % size,
    ],
    tags = [
        # f16/bf16 trigger internal LLVM assertion errors on riscv and wasm.
        "noriscv",
        "nowasm",
    ] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else [],
    target_backends_and_drivers = [
        ("llvm-cpu", "local-task"),
    ],
    target_cpu_features_variants = ["default"] +
                                   # Widening matmuls fail to lower for SVE.
                                   (["arm_64:sve:+sve"] if lhs_rhs_type == acc_type else []),
    trace_runner = "//tools:iree-e2e-matmul-test",
) for (lhs_rhs_type, acc_type) in [
    # ("i8", "i32"),  # TODO(#15800): enable once compile time is reasonable
    # ("f32", "f32"),  # TODO(#15800): enable once compile time is reasonable
    # ("f16", "f16"),  # TODO(#15800): enable once compile time is reasonable
    # ("f16", "f32"),  # TODO(#15800): enable once compile time is reasonable
    # TODO(#15258): enable bf16 tests when that bug is fixed.
    # ("bf16", "bf16"),
    # ("bf16", "f32"),
] for size in [
    "small",
    "large",
]]

# LLVMCPU, non-data-tiling, no microkernels, ArmSME
[iree_generated_trace_runner_test(
    name = "e2e_matmul_arm_sme_nondt_%s_%s" % (dtype, size),
    compiler_flags = [
        "--iree-opt-data-tiling=false",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=%s" % dtype,
        "--acc_type=%s" % dtype,
        "--shapes=%s" % size,
    ],
    target_backends_and_drivers = [
        ("llvm-cpu", "local-task"),
    ],
    target_cpu_features_variants = ["arm_64:sme:+sve,+sme"],
    trace_runner = "//tools:iree-e2e-matmul-test",
) for dtype in [
    "f32",
    # "f64" (also supported for ArmSME, but not by the test generator)
] for size in [
    "small",
    "large",
]]

X86_64_AVX2 = [
    "+avx",
    "+avx2",
    "+fma",
    "+f16c",
]

X86_64_AVX512 = X86_64_AVX2 + [
    "+avx512f",
    "+avx512vl",
    "+avx512cd",
    "+avx512bw",
    "+avx512dq",
]

X86_64_AVX512_VNNI = X86_64_AVX512 + [
    "+avx512vnni",
]

X86_64_AVX512_BF16 = X86_64_AVX512 + [
    "+avx512bf16",
]

# LLVMCPU, data-tiling, data-tiling + ukernels.
[iree_generated_trace_runner_test(
    name = "e2e_matmul_dt%s_%s_%s_%s" % (
        ("_uk" if use_uk else ""),
        lhs_rhs_type,
        acc_type,
        size,
    ),
    compiler_flags = [
        "--iree-opt-data-tiling",
    ] + ["--iree-llvmcpu-enable-ukernels=%s" % ("all" if use_uk else "none")],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=%s" % lhs_rhs_type,
        "--acc_type=%s" % acc_type,
        "--shapes=%s" % size,
    ],
    tags = ([
        # "--shapes=large" can cause timeouts on sanitizers.
        "noasan",
        "notsan",
    ] if size == "large" else []) + ([
        # "--shapes=large" can cause timeouts on RISC-V emulator.
        # f16/bf16 trigger internal LLVM assertion errors on riscv and wasm.
        "noriscv",
        "nowasm",
    ] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else []),
    target_backends_and_drivers = [
        ("llvm-cpu", "local-task"),
    ],
    target_cpu_features_variants = ["default"] +
                                   ([
                                       "arm_64:dotprod:+dotprod",
                                       "arm_64:i8mm:+i8mm",
                                       "x86_64:avx512vnni:" + ",".join(X86_64_AVX512_VNNI),
                                   ] if lhs_rhs_type == "i8" and acc_type == "i32" else [
                                       "x86_64:avx2:" + ",".join(X86_64_AVX2),
                                       "x86_64:avx512:" + ",".join(X86_64_AVX512),
                                   ] if lhs_rhs_type == "f32" and acc_type == "f32" else [
                                       "x86_64:avx2:" + ",".join(X86_64_AVX2),
                                       "x86_64:avx512:" + ",".join(X86_64_AVX512),
                                       "arm_64:fullfp16:+fullfp16",
                                   ] if lhs_rhs_type == "f16" and acc_type == "f16" else [
                                       "x86_64:avx2:" + ",".join(X86_64_AVX2),
                                       "x86_64:avx512:" + ",".join(X86_64_AVX512),
                                       "arm_64:fp16fml:+fp16fml",
                                   ] if lhs_rhs_type == "f16" and acc_type == "f32" else [
                                       "x86_64:avx2:" + ",".join(X86_64_AVX2),
                                       "x86_64:avx512:" + ",".join(X86_64_AVX512),
                                       "x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16),
                                       "arm_64:bf16:+bf16",
                                   ] if lhs_rhs_type == "bf16" and acc_type == "bf16" else [
                                       "x86_64:avx2:" + ",".join(X86_64_AVX2),
                                       "x86_64:avx512:" + ",".join(X86_64_AVX512),
                                       "x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16),
                                       "arm_64:bf16:+bf16",
                                   ] if lhs_rhs_type == "bf16" and acc_type == "f32" else []),
    trace_runner = "//tools:iree-e2e-matmul-test",
) for use_uk in [
    False,
    True,
] for (lhs_rhs_type, acc_type) in (
    [
        ("i8", "i32"),
        ("f32", "f32"),
        ("f16", "f16"),
        ("f16", "f32"),
        ("bf16", "bf16"),
        ("bf16", "f32"),
    ]
) for size in [
    "small",
    "large",
]]

###########################################################################
##
## VMVX backend
##
###########################################################################

# VMVX, data-tiling + microkernels.
[iree_generated_trace_runner_test(
    name = "e2e_matmul_dt_uk_%s_small" % lhs_rhs_type,
    compiler_flags = [
        "--iree-vmvx-enable-microkernels",
        "--iree-opt-data-tiling",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=%s" % lhs_rhs_type,
        "--shapes=small",
    ],
    target_backends_and_drivers = [
        ("vmvx", "local-task"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
) for lhs_rhs_type in [
    "i8",
    "f32",
]]

###########################################################################
##
## CUDA backend
##
###########################################################################

iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f32_gpu_large_LLVMGPUMatmulSimt",
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f32",
        "--shapes=gpu_large_aligned",
        "--compilation_info=LLVMGPUMatmulSimt",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-nvidia",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)

# Testing Ampere + TensorCore path.
# WMMA TensorCore(F32): wmma.161616.f32.tf32
iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f32_gpu_large_LLVMGPUMatmulTensorCore",
    compiler_flags = [
        "--iree-hal-cuda-llvm-target-arch=sm_80",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f32",
        "--shapes=gpu_large_aligned",
        "--compilation_info=LLVMGPUMatmulTensorCore",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-sm80",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)

iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f32_gpu_large_unaligned",
    compiler_flags = [
        "--iree-hal-cuda-llvm-target-arch=sm_80",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f32",
        "--shapes=gpu_large",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-sm80",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)

iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f16_gpu_large_unaligned",
    compiler_flags = [
        "--iree-hal-cuda-llvm-target-arch=sm_80",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f16",
        "--shapes=gpu_large",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-sm80",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)

# MMA.SYNC TensorCore(F32): mma.sync.1688.f32.t32
iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f32_gpu_large_mma_sync_LLVMGPUMatmulTensorCoreMmaSync",
    compiler_flags = [
        "--iree-hal-cuda-llvm-target-arch=sm_80",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f32",
        "--shapes=gpu_large_aligned",
        "--compilation_info=LLVMGPUMatmulTensorCoreMmaSync",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-sm80",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)

# WMMA TensorCore(F16): wmma.161616.f16.f16
iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f16_gpu_large_LLVMGPUMatmulTensorCore",
    compiler_flags = [
        "--iree-hal-cuda-llvm-target-arch=sm_80",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f16",
        "--shapes=gpu_large_aligned",
        "--compilation_info=LLVMGPUMatmulTensorCore",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-sm80",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)

# MMA.SYNC TensorCore(F16): mma.sync.161616.f16.f16
iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f16_gpu_large_mma_sync_LLVMGPUMatmulTensorCoreMmaSync",
    compiler_flags = [
        "--iree-hal-cuda-llvm-target-arch=sm_80",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f16",
        "--shapes=gpu_large_aligned",
        "--compilation_info=LLVMGPUMatmulTensorCoreMmaSync",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-sm80",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)

[iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_%s_large_split_k" % lhs_rhs_type,
    compiler_flags = [
        "--iree-flow-split-matmul-reduction=4",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=%s" % lhs_rhs_type,
        "--shapes=large",
    ],
    tags = [
        # CUDA cuInit fails with sanitizer on.
        "noasan",
        "nomsan",
        "notsan",
        "noubsan",
        "requires-gpu-nvidia",
        # "--shapes=large" can cause timeouts on riscv emulator.
        "noriscv",
    ],
    target_backends_and_drivers = [
        ("cuda", "cuda"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
) for lhs_rhs_type in [
    "f32",
]]

###########################################################################
##
## Vulkan backend
##
###########################################################################

[iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_{0}_gpu_large_valhall".format(lhs_rhs_type),
    compiler_flags = [
        "--iree-vulkan-target-triple=valhall-unknown-android31",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=%s" % lhs_rhs_type,
        "--shapes=gpu_large_aligned",
        "--compilation_info=SPIRVVectorizeMali",
    ],
    tags = [
        # Nvidia GPUs support a superset of Valhall features
        "requires-gpu-nvidia",
        "vulkan_uses_vk_khr_shader_float16_int8",
    ],
    target_backends_and_drivers = [
        ("vulkan-spirv", "vulkan"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
) for lhs_rhs_type in [
    "i8",
    "f16",
    "f32",
]]

[iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_{0}_gpu_large_ampere".format(lhs_rhs_type),
    compiler_flags = [
        "--iree-vulkan-target-triple=ampere-unknown-linux",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=%s" % lhs_rhs_type,
        "--shapes=gpu_large_aligned",
        "--compilation_info=SPIRVVectorizeNVIDIA",
    ],
    tags = [
        "requires-gpu-sm80",
        "vulkan_uses_vk_khr_shader_float16_int8",
    ],
    target_backends_and_drivers = [
        ("vulkan-spirv", "vulkan"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
) for lhs_rhs_type in [
    "i8",
    "f16",
    "f32",
]]

iree_generated_trace_runner_test(
    name = "e2e_matmul_direct_f16_gpu_large_rdna3",
    compiler_flags = [
        "--iree-vulkan-target-triple=rdna3-unknown-linux",
    ],
    generator = ":generate_e2e_matmul_tests",
    generator_args = [
        "--lhs_rhs_type=f16",
        "--shapes=gpu_large_aligned",
        "--compilation_info=SPIRVCooperativeMatrixVectorize",
    ],
    runner_args = [
        "--require_exact_results=false",
    ],
    tags = [
        "requires-gpu",
        "requires-gpu-rdna3",
        "vulkan_uses_vk_khr_shader_float16_int8",
    ],
    target_backends_and_drivers = [
        ("vulkan-spirv", "vulkan"),
    ],
    trace_runner = "//tools:iree-e2e-matmul-test",
)
